Team: Mike Wisniewski, Henry Lambson, Alex Gregory
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from PIL import Image
from tensorflow.keras import models
from tensorflow.keras.applications import VGG19
from tensorflow.keras.applications.vgg19 import preprocess_input, decode_predictions
Nearly all functions are taken from lecture with small quality of life changes - but no serious alterations to code logic. We added the "grad_cam_single_filter" function to mimic the class provided "grad_cam" function, but at a filter level (instead of the entire model).
def prepare_image_for_display(img, norm_type='max'):
    """Scale an arbitrary float image into a displayable uint8 array in [0, 255].

    Parameters
    ----------
    img : np.ndarray
        Image data with any numeric range; may carry a leading batch axis.
    norm_type : str
        'max'  -> min/max scaling, best for regular images.
        other  -> std scaling centered at 0.5, best when large outliers
                  are possible (e.g. gradient-ascent outputs).

    Returns
    -------
    np.ndarray of dtype uint8, with any singleton batch axis squeezed out.
    """
    if norm_type == 'max':
        # min/max scaling, best for regular images.
        # Guard the denominator: a constant image previously produced NaN
        # (0/0); now it maps cleanly to all zeros.
        value_range = img.max() - img.min()
        new_img = (img - img.min()) / (value_range if value_range != 0 else 1.0)
    else:
        # std scaling, best when we are unsure about large outliers
        new_img = ((img - img.mean()) / (img.std() + 1e-3)) * 0.15 + 0.5
    new_img *= 255
    new_img = np.clip(new_img, 0, 255)
    if len(new_img.shape) > 3:
        # drop a singleton batch dimension, e.g. (1, H, W, C) -> (H, W, C)
        new_img = np.squeeze(new_img)
    return new_img.astype('uint8')
def load_image_as_array(url, size=(224, 224)):
    """Load an image from a local path, resize it, and return a float array.

    Parameters
    ----------
    url : str or path-like
        Path to the image file. (Despite the name, a local path is expected;
        the commented-out lines preserve the original remote-fetch version.)
    size : tuple of int
        (width, height) target passed to PIL resize; defaults to VGG's 224x224.

    Returns
    -------
    np.ndarray of float, shape (height, width, channels).
    """
    # response = requests.get(url)
    # img = Image.open(BytesIO(response.content))
    # Use a context manager so the underlying file handle is closed promptly
    # (the original left it open until garbage collection).
    with Image.open(url) as img:
        resized = img.resize(size)
        return np.array(resized).astype(float)
def generate_pattern(layer_name, filter_index, size=224):
    """Gradient-ascend an input image that maximally activates one filter.

    Builds a sub-model that stops at `layer_name`, starts from a noisy gray
    image, and repeatedly nudges the image in the direction that increases
    the mean activation of channel `filter_index`.

    Relies on module-level globals: `model` (the pretrained network) and
    `ITERATIONS` (number of ascent steps).

    Returns the optimized image as a numpy array of shape (1, size, size, 3).
    """
    # Sub-model exposing the activations of the requested layer.
    target_activations = model.get_layer(layer_name).output
    probe_model = models.Model(inputs=model.input, outputs=target_activations)
    # Start from a gray image with some uniform noise.
    noisy_gray = np.random.random((1, size, size, 3)) * 20 + 128.
    seed_var = tf.Variable(noisy_gray, name='image_var', dtype='float64')
    image = preprocess_input(seed_var)  # only preprocess once
    step_size = 1.
    for _ in range(ITERATIONS):
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(image)
            # Objective: mean activation of the chosen channel.
            activations = probe_model(image)
            objective = tf.reduce_mean(activations[:, :, :, filter_index])
        # Gradient of the objective w.r.t. the input picture.
        gradient = tape.gradient(objective, image)
        # Normalization trick: scale by the gradient's mean L2 norm.
        gradient /= (tf.sqrt(tf.reduce_mean(tf.square(gradient))) + 1e-5)
        image += gradient * step_size  # one ascent step
    # Return a numpy matrix so callers can visualize it directly.
    return image.numpy()
def grad_cam(img, class_idx=None):
    """Compute a Grad-CAM heatmap for `img` at the layer named by LAYER_NAME.

    Parameters
    ----------
    img : np.ndarray
        Image of shape (H, W, 3); a batch axis is added internally.
    class_idx : int or None
        Class to explain. If None, the model's top predicted class is used.

    Returns
    -------
    np.ndarray of shape (1, I, J): the class-activation map, normalized to
    [0, 1] (all zeros if no channel had a positive influence).

    Relies on module-level globals `model` and LAYER_NAME.
    """
    # Output feature map of the target conv layer — this is A_{i,j,k}
    # (keras tensor form) from the Grad-CAM derivation.
    Aijk_as_kt = model.get_layer(LAYER_NAME).output
    # New model exposing both the conv feature map and the class scores.
    new_model = models.Model(inputs=model.input,
                             outputs=[Aijk_as_kt, model.output])
    # Add a batch dimension for the forward pass.
    I = np.expand_dims(img, axis=0)
    # Record the forward pass so we can differentiate the class score
    # with respect to the conv feature map.
    with tf.GradientTape(watch_accessed_variables=True) as tape:
        Aijk_as_tf, fc_as_tf = new_model(I)  # get outputs
        if class_idx is None:  # 'is None' — '== None' is non-idiomatic
            # just get the maximum class, if not set by the user
            class_idx = np.argmax(fc_as_tf)
        sum_fc = tf.reduce_mean(fc_as_tf[:, class_idx])  # quantity to explain
    # Gradient of the predicted class score w.r.t. the feature map (df / dA).
    grad_fc_wrt_Aijk_as_tf = tape.gradient(sum_fc, Aijk_as_tf)
    # alpha_k: mean gradient intensity per channel, i.e. the channel weights.
    alpha_k_as_tf = tf.reduce_mean(grad_fc_wrt_Aijk_as_tf,
                                   axis=(0, 1, 2))  # axis=(Batch x I x J)
    # Pull both quantities out as numpy arrays.
    alpha_k = alpha_k_as_tf.numpy()
    A = Aijk_as_tf.numpy()  # shape B x I x J x Chan
    # Weight each channel by "how important it is" for the predicted class,
    # keeping only positive influences. The broadcasted form replaces the
    # original hard-coded range(256) loop, so this now works for any layer's
    # channel count (256 in block3, 512 in block5, ...).
    A = np.maximum(A * alpha_k, 0)
    # Channel-wise sum gives the class-activation map S_{i,j}.
    S = np.sum(A, axis=-1)  # size Batch x I x J
    # Normalize to [0, 1] for visualization; guard the all-zero case,
    # which previously produced NaNs via 0/0.
    max_val = np.max(S)
    if max_val > 0:
        S /= max_val
    return S
def grad_cam_single_filter(img, class_idx=None):
    """Grad-CAM-style heatmap driven by a single filter instead of a class.

    Mirrors grad_cam, but the differentiated objective is the mean activation
    of channel FILTER_INDEX in layer LAYER_NAME rather than a class score.

    Parameters
    ----------
    img : np.ndarray
        Image of shape (H, W, 3); a batch axis is added internally.
    class_idx : int or None
        Accepted only for signature parity with grad_cam — it is unused here,
        since the objective is a filter activation, not a class.

    Returns
    -------
    np.ndarray of shape (1, I, J): heatmap normalized to [0, 1]
    (all zeros if no channel had a positive influence).

    Relies on module-level globals `model`, LAYER_NAME, and FILTER_INDEX.
    """
    # Output feature map A_{i,j,k} of the target conv layer (keras tensor).
    Aijk_as_kt = model.get_layer(LAYER_NAME).output
    # New model exposing just the conv feature map.
    new_model = models.Model(inputs=model.input,
                             outputs=Aijk_as_kt)
    # Batch the image and preprocess it once, as a watchable variable.
    I = np.expand_dims(img, axis=0)
    I = tf.Variable(I, name='image_var', dtype='float64')
    I = preprocess_input(I)
    # Record the forward pass; the loss is the chosen filter's mean activation.
    with tf.GradientTape(watch_accessed_variables=True) as tape:
        tape.watch(I)
        Aijk_as_tf = new_model(I)  # get outputs
        loss = tf.reduce_mean(Aijk_as_tf[:, :, :, FILTER_INDEX])
    # Gradient of the filter objective w.r.t. the feature map (df / dA).
    grad_fc_wrt_Aijk_as_tf = tape.gradient(loss, Aijk_as_tf)
    # alpha_k: mean gradient intensity per channel.
    alpha_k_as_tf = tf.reduce_mean(grad_fc_wrt_Aijk_as_tf,
                                   axis=(0, 1, 2))  # axis=(Batch x I x J)
    alpha_k = alpha_k_as_tf.numpy()
    A = Aijk_as_tf.numpy()  # shape B x I x J x Chan
    # Weight every channel by its alpha and keep only positive influences.
    # Broadcasting replaces the original hard-coded range(256) loop (and the
    # dead commented-out single-channel variant), so any channel count works.
    A = np.maximum(A * alpha_k, 0)
    # Channel-wise sum gives the heatmap S_{i,j}.
    S = np.sum(A, axis=-1)  # size Batch x I x J
    # Normalize to [0, 1]; guard the all-zero case (previously NaN via 0/0).
    max_val = np.max(S)
    if max_val > 0:
        S /= max_val
    return S
# ---- experiment configuration ----
LAYER_NAME = "block3_conv3"  # mid-network conv layer under study (256 channels)
FILTER_INDEX = 12            # filter (channel) within LAYER_NAME to analyze
IMAGE_LENGTH = 224           # input height expected by VGG19
IMAGE_WIDTH = 224            # input width expected by VGG19
ITERATIONS = 100             # gradient-ascent steps used by generate_pattern
# local test images (downloaded from the ImageNet sample set on Kaggle)
IMG_PATH = ['images/finch.jpeg', 'images/pepper.jpeg', 'images/printer.jpeg', 'images/bus.jpeg', 'images/tricycle.jpeg']
EXTENT = (0, IMAGE_LENGTH, 0, IMAGE_WIDTH)  # imshow extent for heatmap overlays
[3 Points] In groups, you should select a convolutional neural network model that has been pre-trained on a large dataset (preferably, ImageNet). These already trained models are readily available online through many mechanisms, including the keras.application package (Inception, Xception, VGG etc.) https://keras.io/api/applications/
- It is recommended to select a model with somewhat simple structure, like VGG. This can help to simplify how to extract specific filters and inputs to filters later on.
- Explain the model you chose and why. Classify a few images with pre-trained network to verify that it is working properly.
Our group elected to use VGG19 as our pretrained model to investigate. We felt that VGG19 would be a good model to look at because of its high layer count. Since we want to investigate filters in the middle of a model, we felt that a model with more layers would be a good example. VGG19 is strong at classifying simple objects, so it would be easy to find images for us to test on the filters. VGG19 is included in OpenAI's microscope, so we are able to easily verify if the patterns we create for the filters are accurate. We chose to look into the "block3_conv3" layer because it is right in the middle of the model. From this layer, we chose a random filter to investigate, filter 12.
As shown in the example images passed through the model, VGG19 is able to accurately classify each of the pictures with confidences ranging from approximately 82% to almost 100%. The model is clearly functioning correctly. To get an idea of what the model is looking for when it classifies an image, we plotted what the model was activated most by in each image in heatmaps. Below is the code for this analysis and we will elaborate on each image as they come up.
Test images used are from Imagenet and were found here: https://www.kaggle.com/datasets/lijiyu/imagenet
# Load the pre-trained VGG19 model (ImageNet weights, classifier head included)
model = VGG19(weights='imagenet', include_top=True, input_tensor=None)
# set VGG to be frozen — we only probe and visualize it, never train it
for layer in model.layers:
    layer.trainable = False
model.summary()
Model: "vgg19"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 224, 224, 3)] 0
block1_conv1 (Conv2D) (None, 224, 224, 64) 1792
block1_conv2 (Conv2D) (None, 224, 224, 64) 36928
block1_pool (MaxPooling2D) (None, 112, 112, 64) 0
block2_conv1 (Conv2D) (None, 112, 112, 128) 73856
block2_conv2 (Conv2D) (None, 112, 112, 128) 147584
block2_pool (MaxPooling2D) (None, 56, 56, 128) 0
block3_conv1 (Conv2D) (None, 56, 56, 256) 295168
block3_conv2 (Conv2D) (None, 56, 56, 256) 590080
block3_conv3 (Conv2D) (None, 56, 56, 256) 590080
block3_conv4 (Conv2D) (None, 56, 56, 256) 590080
block3_pool (MaxPooling2D) (None, 28, 28, 256) 0
block4_conv1 (Conv2D) (None, 28, 28, 512) 1180160
block4_conv2 (Conv2D) (None, 28, 28, 512) 2359808
block4_conv3 (Conv2D) (None, 28, 28, 512) 2359808
block4_conv4 (Conv2D) (None, 28, 28, 512) 2359808
block4_pool (MaxPooling2D) (None, 14, 14, 512) 0
block5_conv1 (Conv2D) (None, 14, 14, 512) 2359808
block5_conv2 (Conv2D) (None, 14, 14, 512) 2359808
block5_conv3 (Conv2D) (None, 14, 14, 512) 2359808
block5_conv4 (Conv2D) (None, 14, 14, 512) 2359808
block5_pool (MaxPooling2D) (None, 7, 7, 512) 0
flatten (Flatten) (None, 25088) 0
fc1 (Dense) (None, 4096) 102764544
fc2 (Dense) (None, 4096) 16781312
predictions (Dense) (None, 1000) 4097000
=================================================================
Total params: 143,667,240
Trainable params: 0
Non-trainable params: 143,667,240
_________________________________________________________________
# load each test image and sanity-check the classifier's top-3 predictions
for image in IMG_PATH:
    img = load_image_as_array(image, size=(IMAGE_LENGTH, IMAGE_WIDTH))
    # create image tensor and preprocess image for prediction
    img_tensor = np.expand_dims(img, axis=0)
    img_tensor = preprocess_input(img_tensor)
    preds = model.predict(img_tensor)
    # print shape/predictions and plot the (preprocessed) image
    print(img_tensor.shape)
    print('Predicted:', decode_predictions(preds, top=3)[0])
    plt.imshow(prepare_image_for_display(img_tensor))
    plt.show()
(1, 224, 224, 3)
Predicted: [('n01531178', 'goldfinch', 0.99991596), ('n01530575', 'brambling', 2.5116111e-05), ('n01560419', 'bulbul', 1.3279246e-05)]
(1, 224, 224, 3)
Predicted: [('n07720875', 'bell_pepper', 0.9940262), ('n03461385', 'grocery_store', 0.0050144102), ('n07718472', 'cucumber', 0.0006438915)]
(1, 224, 224, 3)
Predicted: [('n03924679', 'photocopier', 0.98173565), ('n04004767', 'printer', 0.018126706), ('n04554684', 'washer', 0.00011148214)]
(1, 224, 224, 3)
Predicted: [('n04146614', 'school_bus', 0.8263167), ('n04252225', 'snowplow', 0.1560661), ('n03417042', 'garbage_truck', 0.0043642567)]
(1, 224, 224, 3)
Predicted: [('n04482393', 'tricycle', 0.9033622), ('n03444034', 'go-kart', 0.049837064), ('n03649909', 'lawn_mower', 0.024878366)]
As evident, VGG19 is able to accurately predict all images with high probability. Although the school bus picture had the lowest probability, there is no concern for accuracy as 82% is still very good. One interesting point that we observed is the school bus picture contains other structures in the background. As a fun hypothesis, our team thinks that some filters will become more excited by background objects than by the school bus itself. Although we don't know if our specific filter will have this excitement, we won't rule out that possibility.
# iterate through each image and visually identify photos that the model predicts well
for image in IMG_PATH:
    # load in image
    img = load_image_as_array(image, size=(IMAGE_LENGTH, IMAGE_WIDTH))
    # create image tensor and preprocess image for prediction
    img_tensor = np.expand_dims(img, axis=0)
    img_tensor = preprocess_input(img_tensor)
    # predict image
    preds = model.predict(img_tensor)
    # extract human-readable labels for the top-3 predictions
    labels = [label[1] for label in decode_predictions(preds, top=3)[0]]
    # index of the single most likely class
    predicted_class = np.argmax(preds)
    # extract top 3 class indices (argsort is ascending, so take the last 3)
    preds = np.squeeze(preds)
    topk_idx = np.argsort(preds)[-3:]
    # get a Grad-CAM heatmap for each of the top-k predictions
    # NOTE(review): the raw (unpreprocessed) img is passed here — grad_cam
    # does not preprocess internally; confirm this matches the lecture code
    all_S = [grad_cam(img, class_idx=idx) for idx in topk_idx]
    # show each heatmap on its own
    plt.figure(figsize=(15, 5))
    for i, idx in enumerate(topk_idx):
        plt.subplot(1, 3, i + 1)
        plt.imshow(np.squeeze(all_S[i]))
        plt.title(labels[i])
    # show each heatmap overlaid on the original image
    plt.figure(figsize=(15, 5), frameon=False)
    for i, idx in enumerate(topk_idx):
        plt.subplot(1, 3, i + 1)
        plt.imshow(img/255, interpolation='nearest', extent=EXTENT)
        # use jet here for color mapping, even though I hate jet
        plt.imshow(np.squeeze(all_S[i]), cmap=plt.cm.jet, alpha=0.4,
                   interpolation='bilinear',
                   extent=EXTENT)
        plt.title(labels[i])
    plt.show()
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
In terms of the overall model, excitement across class predictions is very similar for each picture. For the finch, there is strong evidence to suggest that the edge and tip of the wing is a distinguishing feature for the guess "goldfinch", whereas the brambling focuses near the center of the wing and the bulbul is excited about the wing and underbelly of the bird. For the bell pepper, we were surprised by excitement on the edge of the photo - but this suggests there are features within these bell peppers that are distinguishable from other types of classes (grocery store, cucumber). Also of note, there are excited areas on the bulbs, or tops, of the bell peppers - providing evidence to support that these are features the model tries to identify. For the photocopier, we were surprised that the interface was not as excitable as we anticipated. This, along with the excitement at the tray end of the photocopier, suggests that the excited features used to distinguish a photocopier are its notable paper trays. For the school bus, the excited features are what we expected them to be. For the tricycle, there is evidence to support that the model looks for a specific wheel nut in order to classify a tricycle.
This preliminary analysis for our model is bedrock to our further analysis below and we will be using the above framework to explore further into VGG19.
[4 Points] Select a multi-channel filter (i.e., a feature) in a layer in which to analyze as part of a circuit. This should be a multi-channel filter in a "mid-level" portion of the network (that is, there are a few convolutional layers before and after this chosen layer). You might find using OpenAI microscope a helpful tool for selecting a filter to analyze without writing too much code: https://microscope.openai.com/models/
- Using image gradient techniques, find an input image that maximally excites this chosen multi-channel filter. General techniques are available from class: https://github.com/8000net/LectureNotesMaster/blob/master/04%20LectureVisualizingConvnets.ipynb
- Also send images of varying class (i.e. from ImageNet) through the network and track which classes of images most excite your chosen filter.
- Give a hypothesis for what this multi-channel filter might be extracting. That is, what do you think its function is in the network?
- If using code from another source, you must heavily document the code so that I can grade your understanding of the code used.
In this section, we explore which images maximally excite our given filter, which set of images and classes from a set of test images maximally excite our filter and what our hypothesis is for filter extraction.
First, we generate an image that maximally excites our filter and compare it to OpenAI's Microscope. We use the generate_pattern function from class. Analysis follows.
# for the 12th filter in block 3, conv 3 - what image excites this filter?
excited_image = generate_pattern(LAYER_NAME, FILTER_INDEX, size=IMAGE_LENGTH)
excited_image = prepare_image_for_display(excited_image, norm_type='std')
plt.imshow(excited_image)
plt.show()
OpenAI Microscope VGG19 Unit 12 Image

Here is the input image that maximally excites filter 12 in block 3 convolution 3. Based on this image, we hypothesize that this filter is trained to activate when it finds diagonal lines going upward from left to right. This image shares similarities to the images classified as lines in the Zoom In article (https://distill.pub/2020/circuits/zoom-in/), which led us to the hypothesis that this filter is looking for diagonal lines. Additionally, compared to OpenAI Microscope pictured, we believe that we are calculating the input image that maximally excites this filter correctly.
# render a grid of the maximally-exciting pattern for every filter in the layer
for layer_name in [LAYER_NAME]:
    NUM_CHANNELS = 3                      # RGB output images
    NUM_FILTERS = 256                     # channels in block3_conv3
    HEIGHT = int(NUM_FILTERS**(1/2))      # 16 tiles per side
    WIDTH = int(HEIGHT - 1)               # number of margins between tiles
    margin = 5                            # pixels of black between tiles
    size = IMAGE_LENGTH
    # This is an empty (black) image where we will store our results.
    results = np.zeros((HEIGHT * size + WIDTH * margin, HEIGHT * size + WIDTH * margin, NUM_CHANNELS)).astype('uint8')
    for i in range(HEIGHT):  # iterate over the rows of our results grid
        for j in range(HEIGHT):  # iterate over the columns of our results grid
            # Generate the pattern for filter `i + (j * HEIGHT)` in `layer_name`
            filter_img = generate_pattern(layer_name, i + (j * HEIGHT), size=size)
            filter_img = prepare_image_for_display(filter_img, norm_type='std')
            # Put the result in the square `(i, j)` of the results grid
            horizontal_start = i * size + i * margin
            horizontal_end = horizontal_start + size
            vertical_start = j * size + j * margin
            vertical_end = vertical_start + size
            results[horizontal_start: horizontal_end, vertical_start: vertical_end, :] = filter_img
    # Display the results grid
    plt.figure(figsize=(40, 40))
    plt.imshow(results)
    plt.title(layer_name)
    plt.show()